/*******************************************************************************
																				
	DESCRIPTION: 	This do file creates the dataset with IQ variables				
	
*******************************************************************************/

clear all
* Prefix used in the name of every intermediate file saved by this do-file
global id_code 001_10

**********************************************************************
* A1: Cleaning the first dataset - for individuals born until 1965 (inclusive)
*     Renaming and destringing data
**********************************************************************

* NOTE: the globals IQData and data_intermediate are not set in the visible
* part of this file; they have to be defined before it is run.
use "${IQData}/Enlistment_old.dta", clear

* Keep only relevant variables
keep Lop* INST PPRF_PF PPRF_PGRP

* Rename variables
rename PPRF_PF non_cognit_old
rename PPRF_PGRP cognit_old

* Both score variables are still strings at this point.
* Blank out the letter codes in the non-cognitive score
replace non_cognit_old = "" if inlist(non_cognit_old, "A", "B", "C") // 1 272 changes made

* Blank out the stray non-numeric entry in the cognitive score.
* The entry is a non-ASCII symbol, so matching it literally only works when
* the do-file and the data share the same encoding. real() returns missing
* for anything that is not a number, which makes the test encoding-independent.
* If the literal match fails silently, destring below leaves cognit_old as a
* string and every later numeric comparison on it breaks.
replace cognit_old = "" if cognit_old != "" & missing(real(cognit_old)) // 1 change made

* Destring variables
destring INST non_cognit_old cognit_old, replace

**********************************************************************
* A2: Cleaning the first dataset - for individuals born until 1965 (inclusive)
*     Removing duplicates
**********************************************************************

* Remove rows that are identical on every remaining variable
duplicates drop // 784,991  observations deleted

* Some duplicates are people with the same scores but different enlistment date:
* keep one row per identifier/score combination
duplicates drop Lop* non_cognit_old cognit_old, force // 2043 observations deleted

* Keep a row only if at least one of the two variables of interest is present
keep if non_cognit_old!=. | cognit_old!=. // 79 842 observations deleted

* Within identifiers that still have several rows, drop the rows with a
* missing score.
* NOTE(review): n_rows is computed once and not refreshed, so if every row of
* an identifier has some missing score the identifier disappears entirely -
* confirm this is intended.
bysort Lop*: egen n_rows = count(LopNr_PersonNr)
drop if n_rows>1 & (non_cognit_old==. | cognit_old==.) // 69 + 6 deleted
drop n_rows

duplicates report Lop* // still duplicates but very few: surplus of 567

* The remaining duplicates show no clear pattern, so every row belonging to
* an identifier that still has more than one row is removed
bysort Lop*: egen n_rows = count(LopNr_PersonNr)
keep if n_rows<=1 // 1133 observations deleted
drop n_rows

keep Lop* non_cognit_old cognit_old

save "${data_intermediate}/${id_code}_IQData_old.dta", replace

**********************************************************************
* A3: Cleaning the second dataset - for individuals born between 1966 and 1979
*     Renaming and destringing data
**********************************************************************

use "${IQData}/Enlistment_new.dta", clear

* Only the Lop* key variable(s) and the score Gkap are needed;
* the score is renamed first and then converted to numeric
keep Lop* Gkap
rename Gkap cognit_new
destring cognit_new, replace

* Check for exact duplicates and for repeated identifiers
duplicates drop // no duplicates
duplicates report Lop* // no duplicates in terms of individual

* Rows without a score carry no information
keep if cognit_new!=. // 14 204 observations deleted

save "${data_intermediate}/${id_code}_IQData_new.dta", replace

**********************************************************************
* A4: Creating one dataset
**********************************************************************

* Stack the cleaned old file on top of the cleaned new file
use "${data_intermediate}/${id_code}_IQData_old.dta", clear
append using "${data_intermediate}/${id_code}_IQData_new.dta"

duplicates drop // no duplicates
duplicates report Lop* // surplus of 80

* Some identifiers appear in both files with the same cognitive score.
* For those, the row without cognit_old (i.e. the one that did not come with
* an old score) is removed so that a single row remains.
*   n_rows     - number of rows per identifier
*   new_score  - the identifier's mean cognit_new, spread over all its rows
*   same_score - 1 on rows where that value equals cognit_old
*   any_same   - 1 on all rows of an identifier with at least one such match
bysort Lop*: egen n_rows = count(LopNr_PersonNr)
by Lop*: egen new_score = mean(cognit_new)
gen byte same_score = (new_score==cognit_old)
by Lop*: egen any_same = max(same_score)
drop if n_rows>1 & any_same==1 & cognit_old==.
drop n_rows new_score same_score any_same

duplicates report Lop* // surplus of 26

* The source of the remaining duplicates is unclear, so every row of an
* identifier that still has more than one row is removed
bysort Lop*: egen n_rows = count(LopNr_PersonNr)
drop if n_rows>1
drop n_rows

* rename variables
* Final variable names: the non-cognitive score exists only in the old file;
* the cognitive score takes the old value and falls back on the new one
rename non_cognit_old non_cognit
gen cognit = cond(cognit_old==., cognit_new, cognit_old)

* Three range dummies per score (0-3, 4-6, 7-9) for the machine learning model.
* Note: the variables from the two datasets are on a slightly different scale
* (the alternative would be to standardise the variables).
* A missing score gives 0 on all three dummies.
foreach score in cognit non_cognit {
	gen `score'_dummy1 = inrange(`score', 0, 3)
	gen `score'_dummy2 = inrange(`score', 4, 6)
	gen `score'_dummy3 = inrange(`score', 7, 9)
}

* Only the dummies (and the Lop* keys) are kept in the final file
drop non_cognit cognit_new cognit_old cognit

save "${data_intermediate}/${id_code}_IQData_clean.dta", replace